{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import joblib\n",
    "import datetime as dt\n",
    "import numpy as np\n",
    "import os\n",
    "import gc\n",
    "import matplotlib as  mpl\n",
    "from matplotlib  import pyplot as plt\n",
    "mpl.rcParams[u'font.sans-serif'] = ['simhei']\n",
    "mpl.rcParams['axes.unicode_minus'] = False\n",
    "%matplotlib inline\n",
    "import seaborn as sns\n",
    "from joblib import Parallel, delayed"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.加载原始数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 12.7 s, sys: 14.9 s, total: 27.6 s\n",
      "Wall time: 27.7 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "input_dir = '../preprocess_data_new/'\n",
    "output_dir = '../preprocess_data_new/'\n",
    "train_ax = joblib.load(input_dir + 'train_ax.lz4')\n",
    "valid = joblib.load(input_dir + 'valid.lz4')\n",
    "test = joblib.load(input_dir + 'test.lz4')\n",
    "train_y = joblib.load(input_dir + 'train_y_33465.lz4')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.提取 loan_month, loan_day, loan_week_day"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def gen_mon_day(data):\n",
    "    '''\n",
    "    描述：\n",
    "        把'loan_dt'转化为loan_month、loan_day\n",
    "    参数：\n",
    "        input:\n",
    "            data(DataFrame): 有列loan_dt\n",
    "        output:\n",
    "            data(DataFrame)： 删除列loan_dt, 添加列loan_month和loan_day\n",
    "    '''\n",
    "    df_date = data[['loan_dt']].copy()\n",
    "    df_date.loan_dt = pd.to_datetime(df_date.loan_dt)\n",
    "    df_date['loan_month'] = df_date.loan_dt.apply(lambda x: x.month)\n",
    "    df_date['loan_day'] = df_date.loan_dt.apply(lambda x: x.day)\n",
    "    df_date['loan_week_day'] = df_date.loan_dt.apply(lambda x: x.weekday()+1)\n",
    "    df_date = df_date.drop(columns=['loan_dt'])\n",
    "    return df_date"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['../preprocess_data_new/train_ax_date.lz4']"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 提取时间特征\n",
    "train_ax_date = gen_mon_day(train_ax)\n",
    "valid_date = gen_mon_day(valid)\n",
    "test_date = gen_mon_day(test)\n",
    "\n",
    "# 输出lz4文件\n",
    "joblib.dump(train_ax_date, output_dir + 'train_ax_date.lz4',compress='lz4')\n",
    "joblib.dump(valid_date, output_dir + 'valid_date.lz4',compress='lz4')\n",
    "joblib.dump(test_date, output_dir + 'test_date.lz4',compress='lz4')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3.剔除列中只有单一重复的值,得到train_ax_nodup, valid_dup, test_dup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def judge_dup_col(ser,col,ratio):\n",
    "    '''\n",
    "    描述：\n",
    "        判断Series中重复值的个数是否大于ratio*Series的长度，\n",
    "        如果成立返回对应列名，反之返回None\n",
    "    '''\n",
    "    if ser.value_counts().iloc[0]>ratio*len(ser):\n",
    "        return col\n",
    "    else: \n",
    "        return None \n",
    "        \n",
    "def get_dup_col(df):\n",
    "    dup_col = Parallel(n_jobs=24,verbose=1)(delayed(judge_dup_col)(df[col],col,0.9) for col in df.columns )\n",
    "    return dup_col"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.\n",
      "[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    1.3s\n",
      "[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    2.0s\n",
      "[Parallel(n_jobs=24)]: Done 606 tasks      | elapsed:    3.7s\n",
      "[Parallel(n_jobs=24)]: Done 1306 tasks      | elapsed:    6.3s\n",
      "[Parallel(n_jobs=24)]: Done 2206 tasks      | elapsed:    9.7s\n",
      "[Parallel(n_jobs=24)]: Done 3306 tasks      | elapsed:   13.9s\n",
      "[Parallel(n_jobs=24)]: Done 4606 tasks      | elapsed:   18.7s\n",
      "[Parallel(n_jobs=24)]: Done 6106 tasks      | elapsed:   24.2s\n",
      "[Parallel(n_jobs=24)]: Done 6748 out of 6748 | elapsed:   26.6s finished\n",
      "[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.\n",
      "[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s\n",
      "[Parallel(n_jobs=24)]: Done 6747 out of 6747 | elapsed:    3.1s finished\n",
      "[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.\n",
      "[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s\n",
      "[Parallel(n_jobs=24)]: Done 6747 out of 6747 | elapsed:    3.1s finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['../preprocess_data_new/dup_col_dict']"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# train集中dup_col\n",
    "dup_col_tra = get_dup_col(train_ax)\n",
    "dup_col_tra = [col for col in dup_col_tra if col != None]\n",
    "# valid集中dup_col\n",
    "dup_col_val = get_dup_col(valid)\n",
    "dup_col_val = [col for col in dup_col_val if col != None]\n",
    "# test集中dup_col\n",
    "dup_col_test = get_dup_col(test)\n",
    "dup_col_test = [col for col in dup_col_test if col != None]\n",
    "\n",
    "# 保存 dup_col_dict\n",
    "dup_col_dict = {\n",
    "    'dup_col_tra': dup_col_tra,\n",
    "    'dup_col_val': dup_col_val,\n",
    "    'dup_col_test': dup_col_test\n",
    "}\n",
    "joblib.dump(dup_col_dict, output_dir + 'dup_col_dict')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1943"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# valid和test的数量较少，所以duplicated col的数量较多，取并集\n",
    "dup_col = list(set(dup_col_test).union(set(dup_col_tra).union(set(dup_col_val))))\n",
    "len(dup_col)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 21.6 s, sys: 3.65 s, total: 25.2 s\n",
      "Wall time: 43.4 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "train_ax_nodup = train_ax.drop(columns=dup_col)\n",
    "valid_nodup =valid.drop(columns=dup_col) \n",
    "test_nodup =test.drop(columns=dup_col) \n",
    "joblib.dump(train_ax_nodup, output_dir + 'train_ax_nodup.lz4', compress='lz4')\n",
    "joblib.dump(valid_nodup, output_dir + 'valid_nodup.lz4', compress='lz4') \n",
    "joblib.dump(test_nodup, output_dir + 'test_nodup.lz4', compress='lz4')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.提取n_null"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1.读取null_matrix_dict\n",
    "intput_dir = '../preprocess_data_new/'\n",
    "output_dir = '../preprocess_data_new/'\n",
    "null_matrix_dict = joblib.load(intput_dir + 'null_matrix_dict')\n",
    "row_null_sum_tra = null_matrix_dict['null_matrix_tra'].sum(axis=1)\n",
    "row_null_sum_val = null_matrix_dict['null_matrix_val'].sum(axis=1)\n",
    "row_null_sum_test = null_matrix_dict['null_matrix_test'].sum(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 12 ms, sys: 0 ns, total: 12 ms\n",
      "Wall time: 12 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# 1.先产生新的特征 n_null\n",
    "def gen_null(row_null_sum):\n",
    "    df = pd.DataFrame()\n",
    "    df['n_null'] = row_null_sum.values\n",
    "    return df\n",
    "\n",
    "train_ax_null = gen_null(row_null_sum_tra)\n",
    "valid_null = gen_null(row_null_sum_val)\n",
    "test_null = gen_null(row_null_sum_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2.保存n_null,nd_null\n",
    "joblib.dump(train_ax_null, output_dir + 'train_ax_null.lz4', compress='lz4')\n",
    "joblib.dump(valid_null, output_dir + 'valid_null.lz4', compress='lz4')\n",
    "joblib.dump(test_null, output_dir + 'test_null.lz4', compress='lz4')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 处理缺失值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 分离 离散型特征 和 连续型特征\n",
    "def cat_and_cont(data):\n",
    "    data = data.drop(columns=['id'])\n",
    "    uniq = [[col,data[col].unique().shape[0]] for col in data.columns]\n",
    "    uniq_sort = sorted(uniq,key=lambda x: x[1],reverse=False)\n",
    "    cat_cols = [col for col,num in uniq_sort if num>1 and num<=20]\n",
    "    cont_cols = [col for col,num in uniq_sort if num>1 and num>20]\n",
    "    return cat_cols, cont_cols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 9.92 s, sys: 6.41 s, total: 16.3 s\n",
      "Wall time: 16.4 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "cat_cols, cont_cols = cat_and_cont(train_ax_fea_null_tmp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 离散类型特征：null作为新的一类\n",
    "def gen_cat(data,cat_cols):\n",
    "    for col in cat_cols:\n",
    "        tmp,_ = pd.factorize(data[col])\n",
    "        data[col] = tmp\n",
    "    data_cat = data[['id']+cat_cols]\n",
    "    return data_cat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2.52 s, sys: 1.04 s, total: 3.56 s\n",
      "Wall time: 3.56 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "train_ax_cat = gen_cat(train_ax_fea_null_tmp,cat_cols)\n",
    "valid_cat = gen_cat(valid_fea_null_tmp,cat_cols)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2min 40s, sys: 1.08 s, total: 2min 41s\n",
      "Wall time: 2min 41s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "train_ax_cat.to_csv('./preprocess_data_new/train_ax_cat.csv', index=False)\n",
    "valid_cat.to_csv('./preprocess_data_new/valid_cat.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 连续类型特征\n",
    "import pickle\n",
    "def gen_cont(data,cols,pkl_path = './preprocess_data_new/train_ax_fea_null.pkl'):\n",
    "    _,nd_null = pickle.load(open(pkl_path,'rb'))\n",
    "    new_cont_cols = []\n",
    "    count = 0\n",
    "    for col in cont_cols:\n",
    "        null_rank = nd_null['%s_nd_null'%col]\n",
    "        \n",
    "        if null_rank<=2:\n",
    "            med = data[col].median()\n",
    "            data[col] = data[col].fillna(med)\n",
    "            new_cont_cols.append(col)\n",
    "        elif null_rank==6:\n",
    "            tmp = (data[col].isnull()).values\n",
    "            tmp = tmp*1 # method 1\n",
    "            data['%s_dum'%col] = tmp\n",
    "            new_cont_cols.append('%s_dum'%col)\n",
    "            #tmp = tmp.astype(np.int64) # method2\n",
    "        else :\n",
    "            med = data[col].median()\n",
    "            data[col] = data[col].fillna(med) # 填充中位数\n",
    "            \n",
    "            tmp = (data[col].isnull()).values\n",
    "            tmp = tmp*1 \n",
    "            data['%s_dum'%col] = tmp # 增加一列指示变量\n",
    "            new_cont_cols.append(col)\n",
    "            new_cont_cols.append('%s_dum'%col)\n",
    "        count+=1\n",
    "        if count/100==0:\n",
    "            print(count/len(cont_cols))\n",
    "            \n",
    "    data_cont = data[new_cont_cols]\n",
    "    return data_cont\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 14.2 s, sys: 8.13 s, total: 22.4 s\n",
      "Wall time: 22.4 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "train_ax_cont = gen_cont(train_ax_fea_null_tmp,cont_cols,'./preprocess_data_new/train_ax_fea_null.pkl')\n",
    "valid_cont = gen_cont(valid_fea_null_tmp,cont_cols,'./preprocess_data_new/train_ax_fea_null.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_ax_cont.to_csv('./preprocess_data_new/train_ax_cont.csv', index=False)\n",
    "valid_cont.to_csv('./preprocess_data_new/valid_cont.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
